Create a new analysis directory...
[1] FALSE
[1] FALSE
[1] FALSE
[1] FALSE
[1] FALSE
[1] "/Users/swvanderlaan/git/CirculatoryHealth/AE_20211201_YAW_SWVANDERLAAN_HDAC9"
[1] "_archived" "1. AE_20211201_YAW_SWVANDERLAAN_HDAC9.nb.html" "1. AE_20211201_YAW_SWVANDERLAAN_HDAC9.Rmd"
[4] "2. SNP_analyses.Rmd" "20220317.HDAC9.AESCRNA.results.RData" "20220317.HDAC9.baseline.RData"
[7] "3.1 bulkRNAseq.preparation.nb.html" "3.1 bulkRNAseq.preparation.Rmd" "3.2 bulkRNAseq.main_analysis.nb.html"
[10] "3.2 bulkRNAseq.main_analysis.Rmd" "3.3 bulkRNAseq.additional_figures.Rmd" "3.4 bulkRNAseq.review_comments.Rmd"
[13] "4. report.scrnaseq.nb.html" "4. report.scrnaseq.Rmd" "AE_20211201_YAW_SWVANDERLAAN_HDAC9.Rproj"
[16] "AnalysisPlan" "HDAC9" "images"
[19] "LICENSE" "README.html" "README.md"
[22] "references.bib" "renv" "renv.lock"
[25] "scripts" "SNP" "targets"
source(paste0(PROJECT_loc, "/scripts/functions.R"))install.packages.auto("pander")Loading required package: pander
install.packages.auto("readr")Loading required package: readr
install.packages.auto("optparse")Loading required package: optparse
install.packages.auto("tools")Loading required package: tools
install.packages.auto("dplyr")Loading required package: dplyr
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
install.packages.auto("tidyr")Loading required package: tidyr
install.packages.auto("naniar")Loading required package: naniar
# To get 'data.table' with 'fwrite' to be able to directly write gzipped-files
# Ref: https://stackoverflow.com/questions/42788401/is-possible-to-use-fwrite-from-data-table-with-gzfile
# install.packages("data.table", repos = "https://Rdatatable.gitlab.io/data.table")
library(data.table)Registered S3 method overwritten by 'data.table':
method from
print.data.table
data.table 1.14.2 using 1 threads (see ?getDTthreads). Latest news: r-datatable.com
**********
This installation of data.table has not detected OpenMP support. It should still work but in single-threaded mode.
This is a Mac. Please read https://mac.r-project.org/openmp/. Please engage with Apple and ask them for support. Check r-datatable.com for updates, and our Mac instructions here: https://github.com/Rdatatable/data.table/wiki/Installation. After several years of many reports of installation problems on Mac, it's time to gingerly point out that there have been no similar problems on Windows or Linux.
**********
Attaching package: 'data.table'
The following objects are masked from 'package:dplyr':
between, first, last
install.packages.auto("tidyverse")Loading required package: tidyverse
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✓ ggplot2 3.3.5 ✓ stringr 1.4.0
✓ tibble 3.1.6 ✓ forcats 0.5.1
✓ purrr 0.3.4
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x data.table::between() masks dplyr::between()
x dplyr::filter() masks stats::filter()
x data.table::first() masks dplyr::first()
x dplyr::lag() masks stats::lag()
x data.table::last() masks dplyr::last()
x purrr::transpose() masks data.table::transpose()
install.packages.auto("knitr")Loading required package: knitr
install.packages.auto("DT")Loading required package: DT
Registered S3 method overwritten by 'htmlwidgets':
method from
print.htmlwidget tools:rstudio
install.packages.auto("eeptools")Loading required package: eeptools
Welcome to eeptools for R version 1.2.0!
Developed by Jared E. Knowles 2012-2018
for the Wisconsin Department of Public Instruction
Distributed without warranty.
install.packages.auto("openxlsx")Loading required package: openxlsx
install.packages.auto("haven")Loading required package: haven
install.packages.auto("tableone")Loading required package: tableone
install.packages.auto("sjPlot")Loading required package: sjPlot
install.packages.auto("BlandAltmanLeh")Loading required package: BlandAltmanLeh
# Install the devtools package from Hadley Wickham
install.packages.auto('devtools')Loading required package: devtools
Loading required package: usethis
# for plotting
install.packages.auto("pheatmap")Loading required package: pheatmap
install.packages.auto("forestplot")Loading required package: forestplot
Loading required package: grid
Loading required package: magrittr
Attaching package: 'magrittr'
The following object is masked from 'package:purrr':
set_names
The following object is masked from 'package:tidyr':
extract
Loading required package: checkmate
install.packages.auto("ggplot2")
install.packages.auto("ggpubr")Loading required package: ggpubr
install.packages.auto("UpSetR")Loading required package: UpSetR
devtools::install_github("thomasp85/patchwork")Using github PAT from envvar GITHUB_PAT
Skipping install of 'patchwork' from a github remote, the SHA1 (79223d30) has not changed since last install.
Use `force = TRUE` to force installation
# for Seurat etc
install.packages.auto("GenomicFeatures")Loading required package: GenomicFeatures
Loading required package: BiocGenerics
Attaching package: 'BiocGenerics'
The following objects are masked from 'package:dplyr':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find, get, grep, grepl, intersect,
is.unsorted, lapply, Map, mapply, match, mget, order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff,
sort, table, tapply, union, unique, unsplit, which.max, which.min
Loading required package: S4Vectors
Loading required package: stats4
Attaching package: 'S4Vectors'
The following objects are masked from 'package:data.table':
first, second
The following object is masked from 'package:tidyr':
expand
The following objects are masked from 'package:dplyr':
first, rename
The following objects are masked from 'package:base':
expand.grid, I, unname
Loading required package: IRanges
Attaching package: 'IRanges'
The following object is masked from 'package:purrr':
reduce
The following object is masked from 'package:data.table':
shift
The following objects are masked from 'package:dplyr':
collapse, desc, slice
Loading required package: GenomeInfoDb
Loading required package: GenomicRanges
Loading required package: AnnotationDbi
Loading required package: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see 'citation("Biobase")', and for packages
'citation("pkgname")'.
Attaching package: 'Biobase'
The following object is masked from 'package:checkmate':
anyMissing
Attaching package: 'AnnotationDbi'
The following object is masked from 'package:dplyr':
select
install.packages.auto("GenomicRanges")
install.packages.auto("SummarizedExperiment")Loading required package: SummarizedExperiment
Loading required package: MatrixGenerics
Loading required package: matrixStats
Attaching package: 'matrixStats'
The following objects are masked from 'package:Biobase':
anyMissing, rowMedians
The following object is masked from 'package:checkmate':
anyMissing
The following object is masked from 'package:dplyr':
count
Attaching package: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins, colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs,
colLogSumExps, colMadDiffs, colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges, colRanks, colSdDiffs,
colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads, colWeightedMeans, colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls,
rowAnyNAs, rowAnys, rowAvgsPerColSet, rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs,
rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins, rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs,
rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars
The following object is masked from 'package:Biobase':
rowMedians
install.packages.auto("DESeq2")Loading required package: DESeq2
install.packages.auto("org.Hs.eg.db")Loading required package: org.Hs.eg.db
install.packages.auto("mygene")Loading required package: mygene
install.packages.auto("TxDb.Hsapiens.UCSC.hg19.knownGene")Loading required package: TxDb.Hsapiens.UCSC.hg19.knownGene
install.packages.auto("org.Hs.eg.db")
install.packages.auto("AnnotationDbi")
install.packages.auto("EnsDb.Hsapiens.v86")Loading required package: EnsDb.Hsapiens.v86
Loading required package: ensembldb
Loading required package: AnnotationFilter
Attaching package: 'AnnotationFilter'
The following object is masked from 'package:magrittr':
not
Attaching package: 'ensembldb'
The following object is masked from 'package:openxlsx':
addFilter
The following object is masked from 'package:dplyr':
filter
The following object is masked from 'package:stats':
filter
install.packages.auto("EnhancedVolcano")Loading required package: EnhancedVolcano
Loading required package: ggrepel
Registered S3 methods overwritten by 'ggalt':
method from
grid.draw.absoluteGrob ggplot2
grobHeight.absoluteGrob ggplot2
grobWidth.absoluteGrob ggplot2
grobX.absoluteGrob ggplot2
grobY.absoluteGrob ggplot2
Today = format(as.Date(as.POSIXlt(Sys.time())), "%Y%m%d")
Today.Report = format(as.Date(as.POSIXlt(Sys.time())), "%A, %B %d, %Y")
### UtrechtScienceParkColoursScheme
###
### WebsitetoconvertHEXtoRGB:http://hex.colorrrs.com.
### Forsomefunctionsyoushoulddividethesenumbersby255.
###
### No. Color HEX (RGB) CHR MAF/INFO
###---------------------------------------------------------------------------------------
### 1 yellow #FBB820 (251,184,32) => 1 or 1.0>INFO
### 2 gold #F59D10 (245,157,16) => 2
### 3 salmon #E55738 (229,87,56) => 3 or 0.05<MAF<0.2 or 0.4<INFO<0.6
### 4 darkpink #DB003F ((219,0,63) => 4
### 5 lightpink #E35493 (227,84,147) => 5 or 0.8<INFO<1.0
### 6 pink #D5267B (213,38,123) => 6
### 7 hardpink #CC0071 (204,0,113) => 7
### 8 lightpurple #A8448A (168,68,138) => 8
### 9 purple #9A3480 (154,52,128) => 9
### 10 lavendel #8D5B9A (141,91,154) => 10
### 11 bluepurple #705296 (112,82,150) => 11
### 12 purpleblue #686AA9 (104,106,169) => 12
### 13 lightpurpleblue #6173AD (97,115,173/101,120,180) => 13
### 14 seablue #4C81BF (76,129,191) => 14
### 15 skyblue #2F8BC9 (47,139,201) => 15
### 16 azurblue #1290D9 (18,144,217) => 16 or 0.01<MAF<0.05 or 0.2<INFO<0.4
### 17 lightazurblue #1396D8 (19,150,216) => 17
### 18 greenblue #15A6C1 (21,166,193) => 18
### 19 seaweedgreen #5EB17F (94,177,127) => 19
### 20 yellowgreen #86B833 (134,184,51) => 20
### 21 lightmossgreen #C5D220 (197,210,32) => 21
### 22 mossgreen #9FC228 (159,194,40) => 22 or MAF>0.20 or 0.6<INFO<0.8
### 23 lightgreen #78B113 (120,177,19) => 23/X
### 24 green #49A01D (73,160,29) => 24/Y
### 25 grey #595A5C (89,90,92) => 25/XY or MAF<0.01 or 0.0<INFO<0.2
### 26 lightgrey #A2A3A4 (162,163,164) => 26/MT
###
### ADDITIONAL COLORS
### 27 midgrey #D7D8D7
### 28 verylightgrey #ECECEC"
### 29 white #FFFFFF
### 30 black #000000
###----------------------------------------------------------------------------------------------
uithof_color = c("#FBB820","#F59D10","#E55738","#DB003F","#E35493","#D5267B",
"#CC0071","#A8448A","#9A3480","#8D5B9A","#705296","#686AA9",
"#6173AD","#4C81BF","#2F8BC9","#1290D9","#1396D8","#15A6C1",
"#5EB17F","#86B833","#C5D220","#9FC228","#78B113","#49A01D",
"#595A5C","#A2A3A4", "#D7D8D7", "#ECECEC", "#FFFFFF", "#000000")
uithof_color_legend = c("#FBB820", "#F59D10", "#E55738", "#DB003F", "#E35493",
"#D5267B", "#CC0071", "#A8448A", "#9A3480", "#8D5B9A",
"#705296", "#686AA9", "#6173AD", "#4C81BF", "#2F8BC9",
"#1290D9", "#1396D8", "#15A6C1", "#5EB17F", "#86B833",
"#C5D220", "#9FC228", "#78B113", "#49A01D", "#595A5C",
"#A2A3A4", "#D7D8D7", "#ECECEC", "#FFFFFF", "#000000")
### ----------------------------------------------------------------------------For the ERA-CVD ‘druggable-MI-targets’ project (grantnumber: 01KL1802) we performed two related RNA sequencing (RNAseq) experiments:
conventional (‘bulk’) RNAseq using RNA extracted from carotid plaque samples, n ± 700. As of Friday, March 18, 2022 all samples have been selected and RNA has been extracted; quality control (QC) was performed and we have a dataset of 635 samples.
single-cell RNAseq (scRNAseq) of at least n = 40 samples (20 females, 20 males). As of Friday, March 18, 2022 data is available of 40 samples (3 females, 15 males), we are extending sampling to get more female samples.
Plaque samples are derived from carotid endarterectomies as part of the Athero-Express Biobank Study which is an ongoing study in the UMC Utrecht.
Here we obtain data from the HDAC9 in plaques.
library(openxlsx)
gene_list_df <- read.xlsx(paste0(PROJECT_loc, "/targets/Genes.xlsx"), sheet = "Genes")
target_genes <- unlist(gene_list_df$Gene)
target_genes[1] "HDAC9" "TWIST1" "IL6" "IL1B"
First we will load the data:
Here we load the latest dataset from our Athero-Express bulk RNA experiment d.d. 2021-12-03 mapped to b37 and Ensembl 87.
These bulk RNAseq data are filtered and corrected:
# bulk RNAseq data
bulkRNA_counts_raw_qc_umicorr <- fread(paste0(AERNA_loc,"/raw_data_bulk/raw_counts_batch1till11_qc_umicorrected.txt"))
# batch information
bulkRNA_meta <- fread(paste0(AERNA_loc,"/raw_data_bulk/metadata_raw_counts_batch1till11.txt"))Quick peek at the counts and meta-data of the RNAseq experiment.
head(bulkRNA_counts_raw_qc_umicorr)
head(bulkRNA_meta)There are two small issues we need to address:
Inf valuescat("\nThere are a couple of samples with infinite gene counts.\n")
There are a couple of samples with infinite gene counts.
temp <- bulkRNA_counts_raw_qc_umicorr %>% mutate_if(is.numeric, as.integer)Warning: Problem with `mutate()` column `ae2341`.
ℹ `ae2341 = .Primitive("as.integer")(ae2341)`.
ℹ NAs introduced by coercion to integer range
Warning: Problem with `mutate()` column `ae3078`.
ℹ `ae3078 = .Primitive("as.integer")(ae3078)`.
ℹ NAs introduced by coercion to integer range
Warning: Problem with `mutate()` column `ae1422`.
ℹ `ae1422 = .Primitive("as.integer")(ae1422)`.
ℹ NAs introduced by coercion to integer range
Warning: Problem with `mutate()` column `ae2305`.
ℹ `ae2305 = .Primitive("as.integer")(ae2305)`.
ℹ NAs introduced by coercion to integer range
Warning: Problem with `mutate()` column `ae1256`.
ℹ `ae1256 = .Primitive("as.integer")(ae1256)`.
ℹ NAs introduced by coercion to integer range
Warning: Problem with `mutate()` column `ae411`.
ℹ `ae411 = .Primitive("as.integer")(ae411)`.
ℹ NAs introduced by coercion to integer range
Warning: Problem with `mutate()` column `ae1227`.
ℹ `ae1227 = .Primitive("as.integer")(ae1227)`.
ℹ NAs introduced by coercion to integer range
summary(bulkRNA_counts_raw_qc_umicorr$ae2341) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 Inf 15 Inf
summary(bulkRNA_counts_raw_qc_umicorr$ae3078) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 1 Inf 20 Inf
summary(bulkRNA_counts_raw_qc_umicorr$ae1422) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 1 Inf 12 Inf
summary(bulkRNA_counts_raw_qc_umicorr$ae2305) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 Inf 7 Inf
summary(bulkRNA_counts_raw_qc_umicorr$ae1256) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 Inf 10 Inf
summary(bulkRNA_counts_raw_qc_umicorr$ae411) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 0 Inf 15 Inf
summary(bulkRNA_counts_raw_qc_umicorr$ae1227) Min. 1st Qu. Median Mean 3rd Qu. Max.
0 0 1 Inf 11 Inf
cat("\nFixing the infinite gene counts.\n")
Fixing the infinite gene counts.
temp <- bulkRNA_counts_raw_qc_umicorr %>%
mutate(across( # For every column you want...
starts_with("ae"), # ...change all studynumber
~ case_when(
. == Inf ~ max(.[is.finite(.)]), # +Inf becomes the finite max.
. == -Inf ~ min(.[is.finite(.)]), # -Inf becomes the finite min.
TRUE ~ . # Other values stay the same.
)
)
)
library("devtools")
devtools::install_github("stephenturner/annotables")Using github PAT from envvar GITHUB_PAT
Downloading GitHub repo stephenturner/annotables@HEAD
These packages have more recent versions available.
It is recommended to update all of them.
Which would you like to update?
1: All
2: CRAN packages only
3: None
4: rlang (0.4.12 -> 1.0.2) [CRAN]
5: glue (1.6.0 -> 1.6.2) [CRAN]
6: fansi (1.0.0 -> 1.0.2) [CRAN]
7: crayon (1.4.2 -> 1.5.0) [CRAN]
8: cli (3.1.0 -> 3.2.0) [CRAN]
9: pillar (1.6.4 -> 1.7.0) [CRAN]
10: magrittr (2.0.1 -> 2.0.2) [CRAN]
3
checking for file ‘/private/var/folders/cj/1vxt4xb11m1cx7wn020f8hww0000gn/T/Rtmp0fcsU4/remotes151b27240de45/stephenturner-annotables-631423c/DESCRIPTION’ ...
✓ checking for file ‘/private/var/folders/cj/1vxt4xb11m1cx7wn020f8hww0000gn/T/Rtmp0fcsU4/remotes151b27240de45/stephenturner-annotables-631423c/DESCRIPTION’ (435ms)
─ preparing ‘annotables’:
checking DESCRIPTION meta-information ...
✓ checking DESCRIPTION meta-information
─ checking for LF line-endings in source and make files and shell scripts
─ checking for empty or unneeded directories
NB: this package now depends on R (>= 3.5.0)
WARNING: Added dependency on R >= 3.5.0 because serialized objects in
serialize/load version 3 cannot be read in older versions of R.
File(s) containing such objects:
‘annotables/data/bdgp6.rda’ ‘annotables/data/bdgp6_tx2gene.rda’
‘annotables/data/ensembl_version.rda’ ‘annotables/data/galgal5.rda’
‘annotables/data/galgal5_tx2gene.rda’ ‘annotables/data/grch37.rda’
‘annotables/data/grch37_tx2gene.rda’ ‘annotables/data/grch38.rda’
‘annotables/data/grch38_tx2gene.rda’ ‘annotables/data/grcm38.rda’
‘annotables/data/grcm38_tx2gene.rda’ ‘annotables/data/mmul801.rda’
‘annotables/data/mmul801_tx2gene.rda’ ‘annotables/data/rnor6.rda’
‘annotables/data/rnor6_tx2gene.rda’ ‘annotables/data/wbcel235.rda’
‘annotables/data/wbcel235_tx2gene.rda’
─ building ‘annotables_0.1.91.tar.gz’
Installing package into '/Users/swvanderlaan/Library/R/x86_64/4.1/library'
(as 'lib' is unspecified)
* installing *source* package ‘annotables’ ...
** using staged installation
** R
** data
*** moving datasets to lazyload DB
** inst
** byte-compile and prepare package for lazy loading
** help
*** installing help indices
** building package indices
** testing if installed package can be loaded from temporary location
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (annotables)
library(dplyr)
library(annotables)
# Columns of interest
# entrez
# symbol
# chr
# start
# end
# strand
# biotype
# description
cat("\nChecking existence of duplicate ENSEMBL IDs - there shouldn't be any.\n")
Checking existence of duplicate ENSEMBL IDs - there shouldn't be any.
id <- temp$ENSEMBL_gene_ID
id[ id %in% id[duplicated(id)] ]character(0)
rm(id)cat("\nAnnotating with b37.\n")
Annotating with b37.
bulkRNA_counts <- temp %>%
# arrange(p.adjusted) %>%
# head(20) %>%
inner_join(grch37, by=c("ENSEMBL_gene_ID"="ensgene")) %>%
# select(gene, estimate, p.adjusted, symbol, description) %>%
relocate(entrez, symbol, chr, start, end, strand, biotype, description,
.before = ae1618) %>%
dplyr::filter(duplicated(ENSEMBL_gene_ID) == FALSE)
head(bulkRNA_counts)
id <- bulkRNA_counts$ENSEMBL_gene_ID
id[ id %in% id[duplicated(id)] ]character(0)
Loading Athero-Express clinical data that we previously saved in an RDS file.
AEDB.CEA <- readRDS(file = paste0(OUT_loc, "/20220317.HDAC9.AEDB.CEA.RDS"))We will fix the STUDY_NUMBER to match the bulkRNAseq
data.
AEDB.CEA$STUDY_NUMBER <- paste0("ae", AEDB.CEA$STUDY_NUMBER)
head(AEDB.CEA$STUDY_NUMBER)[1] "ae1" "ae2" "ae3" "ae4" "ae5" "ae6"
We have collected the clinical data, Athero-Express Biobank Study
AEDB and, the UMI-corrected, filtered bulk RNAseq data,
bulkRNA_counts and its meta-data,
bulkRNA-meta.
Here we will clean up the data and create a
SummarizedExperiment() object for downstream analyses anad
visualizations.
AEDB.CEA.sampleList <- AEDB.CEA$STUDY_NUMBER
# first 9 columns
# ENSEMBL_gene_ID
# entrez
# symbol
# chr
# start
# end
# strand
# biotype
# description
# match up with meta data of RNAseq experiment
bulkRNA_countsFilt <- bulkRNA_counts %>%
drop_na(chr) %>% # remove rows that have no information of start, end, chromosome and/or strand
dplyr::select(1:9, one_of(sort(as.character(AEDB.CEA.sampleList)))) # select gene expression of only patients in RNA-seq AE df, sort in same order as metadata study_number
dim(bulkRNA_countsFilt)[1] 59851 620
study_samples_bulkNEW <- colnames(bulkRNA_counts[, -(1:9)])
length(study_samples_bulkNEW)[1] 656
study_samples_AEDBCEA <- c(AEDB.CEA$STUDY_NUMBER)
setdif_samples_NEWvsAEDBCEA <- setdiff(study_samples_bulkNEW, study_samples_AEDBCEA)
setdif_samples_AEDBCEAvsNEW <- setdiff(study_samples_AEDBCEA, study_samples_bulkNEW)
AEDB_filt <- AEDB.CEA[AEDB.CEA$STUDY_NUMBER %in% setdif_samples_NEWvsAEDBCEA,]
table(AEDB_filt$Artery_summary, AEDB_filt$Gender)
female male
No artery known (yet), no surgery (patient ill, died, exited study), re-numbered to AAA 0 0
carotid (left & right) 0 0
femoral/iliac (left, right or both sides) 0 0
other carotid arteries (common, external) 0 0
carotid bypass and injury (left, right or both sides) 0 0
aneurysmata (carotid & femoral) 0 0
aorta 0 0
other arteries (renal, popliteal, vertebral) 0 0
femoral bypass, angioseal and injury (left, right or both sides) 0 0
# Cut up bulkRNA_countsFilt into 'assay' and 'ranges' part
counts <- as.data.frame(bulkRNA_countsFilt[,-(1:9)]) ## assay part
counts <- counts %>% mutate_if(is.numeric, as.integer)
rownames(counts) <- bulkRNA_countsFilt$ENSEMBL_gene_ID ## assign rownames
id <- bulkRNA_countsFilt$ENSEMBL_gene_ID
id[ id %in% id[duplicated(id)] ]character(0)
bulkRNA_rowRanges <- GRanges(bulkRNA_countsFilt$chr, ## construct a GRanges object containing 4 columns (seqnames, ranges, strand, seqinfo) plus a metadata colum (feature_id): this will be the 'rowRanges' bit
IRanges(bulkRNA_countsFilt$start, bulkRNA_countsFilt$end),
strand = bulkRNA_countsFilt$strand,
feature_id = bulkRNA_countsFilt$ENSEMBL_gene_ID) #, df$pid)
names(bulkRNA_rowRanges) <- bulkRNA_rowRanges$feature_id
# ?org.Hs.eg.db
# ?AnnotationDb
bulkRNA_rowRanges$symbol <- mapIds(org.Hs.eg.db,
keys = bulkRNA_rowRanges$feature_id,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")'select()' returned 1:many mapping between keys and columns
# Reference: https://shiring.github.io/genome/2016/10/23/AnnotationDbi
# gene dataframe for EnsDb.Hsapiens.v86
gene_dataframe_EnsDb <- ensembldb::select(EnsDb.Hsapiens.v86, keys = bulkRNA_rowRanges$feature_id,
columns = c("ENTREZID", "SYMBOL", "GENEBIOTYPE"), keytype = "GENEID")
colnames(gene_dataframe_EnsDb) <- c("Ensembl", "Entrez", "HGNC", "GENEBIOTYPE")
colnames(gene_dataframe_EnsDb) <- paste(colnames(gene_dataframe_EnsDb), "EnsDb86", sep = "_")
head(gene_dataframe_EnsDb)
bulkRNA_rowRanges$GENEBIOTYPE_EnsDb86 <- gene_dataframe_EnsDb$GENEBIOTYPE_EnsDb86[match(bulkRNA_rowRanges$feature_id, gene_dataframe_EnsDb$Ensembl_EnsDb86)]
bulkRNA_rowRangesGRanges object with 59851 ranges and 3 metadata columns:
seqnames ranges strand | feature_id symbol GENEBIOTYPE_EnsDb86
<Rle> <IRanges> <Rle> | <character> <character> <character>
ENSG00000000003 X 100627108-100639991 - | ENSG00000000003 TSPAN6 protein_coding
ENSG00000000419 20 50934867-50959140 - | ENSG00000000419 DPM1 protein_coding
ENSG00000000457 1 169849631-169894267 - | ENSG00000000457 SCYL3 protein_coding
ENSG00000000460 1 169662007-169854080 + | ENSG00000000460 C1orf112 protein_coding
ENSG00000000938 1 27612064-27635185 - | ENSG00000000938 FGR protein_coding
... ... ... ... . ... ... ...
ENSG00000248205 5 17502043-17502363 - | ENSG00000248205 <NA> processed_pseudogene
ENSG00000259098 15 22258138-22258848 + | ENSG00000259098 <NA> processed_pseudogene
ENSG00000267090 19 38385522-38386759 + | ENSG00000267090 <NA> antisense
ENSG00000279119 17 38727833-38728198 - | ENSG00000279119 <NA> protein_coding
ENSG00000271242 13 51195881-51196081 + | ENSG00000271242 PRELID3BP2 processed_pseudogene
-------
seqinfo: 338 sequences from an unspecified genome; no seqlengths
# merging the two dataframes by HGNC
# bulkRNA_rowRangesHg19Ensemblb86 <- GRanges(merge(bulkRNA_rowRanges, gene_dataframe_EnsDb, by.x = "feature_id", by.y = "Ensembl_EnsDb86", sort = FALSE, all.x = TRUE))
# names(bulkRNA_rowRangesHg19Ensemblb86) <- bulkRNA_rowRangesHg19Ensemblb86$feature_id
# bulkRNA_rowRangesHg19Ensemblb86
# temp <- as.data.frame(table(bulkRNA_rowRanges$GENEBIOTYPE_EnsDb86))
# colnames(temp) <- c("GeneBiotype", "Count")
#
# ggpubr::ggbarplot(temp, x = "GeneBiotype", y = "Count",
# color = "GeneBiotype", fill = "GeneBiotype",
# xlab = "gene type") +
# theme(axis.text.x = element_text(angle = 45))
# rm(temp)# match up with meta data of RNAseq experiment
bulkRNA_meta %<>%
dplyr::filter(study_number %in% AEDB.CEA.sampleList) # select gene expression of only patients in RNA-seq AE df, sort in same order as metadata study_number
# combine meta data from experiment with clinical data
bulkRNA_meta_clin <- merge(bulkRNA_meta, AEDB.CEA, by.x = "study_number", by.y = "STUDY_NUMBER",
sort = FALSE, all.x = TRUE)
bulkRNA_meta_clin %<>%
# mutate(macrophages = factor(macrophages, levels = c("no staining", "minor staining", "moderate staining", "heavy staining"))) %>%
# mutate(smc = factor(smc, levels = c("no staining", "minor staining", "moderate staining", "heavy staining"))) %>%
# mutate(calcification = factor(calcification, levels = c("no staining", "minor staining", "moderate staining", "heavy staining"))) %>%
# mutate(collagen = factor(collagen, levels = c("no staining", "minor staining", "moderate staining", "heavy staining"))) %>%
# mutate(fat = factor(fat, levels = c("no fat", "< 40% fat", "> 40% fat"))) %>%
mutate(study_number_row = study_number) %>%
as.data.frame() %>%
column_to_rownames("study_number_row")
head(bulkRNA_meta_clin)dim(bulkRNA_meta_clin)[1] 650 1714
We make a SummarizedExperiment for the RNAseq data. We
make sure to only include the samples we need based on informed consent
and we include only the requested variables.
First, we define the variables we need.
# Baseline table variables
basetable_vars = c("Hospital", "ORyear", "Artery_summary",
"Age", "Gender",
# "TC_finalCU", "LDL_finalCU", "HDL_finalCU", "TG_finalCU",
"TC_final", "LDL_final", "HDL_final", "TG_final",
# "hsCRP_plasma",
"systolic", "diastoli", "GFR_MDRD", "BMI",
"KDOQI", "BMI_WHO",
"SmokerStatus", "AlcoholUse",
"DiabetesStatus",
"Hypertension.selfreport", "Hypertension.selfreportdrug", "Hypertension.composite", "Hypertension.drugs",
"Med.anticoagulants", "Med.all.antiplatelet", "Med.Statin.LLD",
"Stroke_Dx", "sympt", "Symptoms.5G", "AsymptSympt", "AsymptSympt2G",
"Symptoms.Update2G", "Symptoms.Update3G",
"restenos", "stenose",
"CAD_history", "PAOD", "Peripheral.interv",
"EP_composite", "EP_composite_time", "EP_major", "EP_major_time",
"MAC_rankNorm", "SMC_rankNorm", "Macrophages.bin", "SMC.bin",
"Neutrophils_rankNorm", "MastCells_rankNorm",
"IPH.bin", "VesselDensity_rankNorm",
"Calc.bin", "Collagen.bin",
"Fat.bin_10", "Fat.bin_40",
"OverallPlaquePhenotype", "Plaque_Vulnerability_Index")
basetable_bin = c("Gender", "Artery_summary",
"KDOQI", "BMI_WHO",
"SmokerStatus", "AlcoholUse",
"DiabetesStatus",
"Hypertension.selfreport", "Hypertension.selfreportdrug", "Hypertension.composite", "Hypertension.drugs",
"Med.anticoagulants", "Med.all.antiplatelet", "Med.Statin.LLD",
"Stroke_Dx", "sympt", "Symptoms.5G", "AsymptSympt", "AsymptSympt2G",
"Symptoms.Update2G", "Symptoms.Update3G",
"restenos", "stenose",
"CAD_history", "PAOD", "Peripheral.interv",
"EP_composite", "Macrophages.bin", "SMC.bin",
"IPH.bin",
"Calc.bin", "Collagen.bin",
"Fat.bin_10", "Fat.bin_40",
"OverallPlaquePhenotype", "Plaque_Vulnerability_Index")
# basetable_bin
basetable_con = basetable_vars[!basetable_vars %in% basetable_bin]
# basetable_conNext, we are constructing the SummarizedExperiment.
cat("* loading data ...\n")* loading data ...
# this is all the data passing RNAseq quality control and UMI-corrected
# - includes 656 patients
# - after filtering on informed consent and artery type, the end sample size should be 611
# - after filtering on 'no commercial business' based on informed consent, there are fewer samples: 608
dim(bulkRNA_countsFilt)[1] 59851 620
dim(counts)[1] 59851 611
cat("\n* making a SummarizedExperiment ...\n")
* making a SummarizedExperiment ...
cat(" > getting counts\n") > getting counts
head(counts)head(bulkRNA_countsFilt)
cat(" > meta data\n") > meta data
temp_coldat <- data.frame(STUDY_NUMBER = names(bulkRNA_countsFilt[,10:620]),
SampleType = "plaque", RNAseqType = "3' RNAseq",
row.names = names(bulkRNA_countsFilt[,10:620]))
cat(" > clinical data\n") > clinical data
# bulkRNA_meta_clin_COMMERCIAL <- subset(bulkRNA_meta_clin, select = c("study_number", basetable_vars))
bulkRNA_meta_clin_ACADEMIC <- subset(bulkRNA_meta_clin, select = c("study_number", basetable_vars))
# temp_coldat_clin <- merge(temp_coldat, bulkRNA_meta_clin_COMMERCIAL, by.x = "STUDY_NUMBER", by.y = "study_number", sort = FALSE, all.x = TRUE)
temp_coldat_clin <- merge(temp_coldat, bulkRNA_meta_clin_ACADEMIC, by.x = "STUDY_NUMBER", by.y = "study_number", sort = FALSE, all.x = TRUE)
rownames(temp_coldat_clin) <- temp_coldat_clin$STUDY_NUMBER
dim(temp_coldat_clin)[1] 611 58
cat(" > construction of the SE\n") > construction of the SE
(AERNASE <- SummarizedExperiment(assays = list(counts = as.matrix(counts)),
colData = temp_coldat_clin,
rowRanges = bulkRNA_rowRanges,
metadata = "Athero-Express Biobank Study bulk RNA sequencing. Sample type: carotid plaques. Technology: CEL2-seq adapted for bulk RNA sequencing, thus 3'-focused. UMI-corrected"))class: RangedSummarizedExperiment
dim: 59851 611
metadata(1): ''
assays(1): counts
rownames(59851): ENSG00000000003 ENSG00000000419 ... ENSG00000279119 ENSG00000271242
rowData names(3): feature_id symbol GENEBIOTYPE_EnsDb86
colnames(611): ae1 ae1026 ... ae998 ae999
colData names(58): STUDY_NUMBER SampleType ... OverallPlaquePhenotype Plaque_Vulnerability_Index
cat("\n* removing intermediate files ...\n")
* removing intermediate files ...
rm(temp_coldat, temp_coldat_clin)Do the study numbers correspond between metadata and expression data?
## check whether rownames metadata and colnames counts are identical
all(colnames(AERNASE) == colnames(counts))[1] TRUE
So, now we have raw counts for all patients included in the bulk RNAseq data, with all clinical data annotated to them.
Some of the patients might be missing in certain variables:
# We know that some of the patients of the RNAseq is not included in some variables
which(is.na(AERNASE$Gender))
missing_values <- which(is.na(AERNASE$Gender))
missing_valuesNo need to remove missing samples based on a variable, since we will make a DESeq2 object using an empty model.
(AERNASE <- AERNASE[,])
# (AERNASE <- AERNASE[, -missing_values])
# (se <- se[, se$sex == "male"])cat("====================================================================================================")====================================================================================================
cat("SELECTION THE SHIZZLE")SELECTION THE SHIZZLE
AERNASEClinData <- as.tibble(colData(AERNASE))Warning: `as.tibble()` was deprecated in tibble 2.0.0.
Please use `as_tibble()` instead.
The signature and semantics have changed, see `?as_tibble`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
cat("- sanity checking PRIOR to selection")- sanity checking PRIOR to selection
library(data.table)
require(labelled)Loading required package: labelled
ae.gender <- to_factor(AERNASEClinData$Gender)
ae.hospital <- to_factor(AERNASEClinData$Hospital)
table(ae.gender, ae.hospital, dnn = c("Sex", "Hospital"), useNA = "ifany") Hospital
Sex St. Antonius, Nieuwegein UMC Utrecht
female 94 54
male 255 208
ae.artery <- to_factor(AERNASEClinData$Artery_summary)
table(ae.artery, ae.gender, dnn = c("Sex", "Artery"), useNA = "ifany") Artery
Sex female male
No artery known (yet), no surgery (patient ill, died, exited study), re-numbered to AAA 0 0
carotid (left & right) 148 461
femoral/iliac (left, right or both sides) 0 0
other carotid arteries (common, external) 0 2
carotid bypass and injury (left, right or both sides) 0 0
aneurysmata (carotid & femoral) 0 0
aorta 0 0
other arteries (renal, popliteal, vertebral) 0 0
femoral bypass, angioseal and injury (left, right or both sides) 0 0
rm(ae.gender, ae.hospital, ae.artery)
# AERNASEClinData[1:10, 1:10]
dim(AERNASEClinData)[1] 611 58
# DT::datatable(AERNASEClinData)Showing the baseline table for the scRNAseq data in 39 CEA patients with informed consent.
cat("===========================================================================================")===========================================================================================
cat("CREATE BASELINE TABLE")CREATE BASELINE TABLE
# Create baseline tables
# http://rstudio-pubs-static.s3.amazonaws.com/13321_da314633db924dc78986a850813a50d5.html
AERNASEClinData.CEA.tableOne = print(CreateTableOne(vars = basetable_vars,
# factorVars = basetable_bin,
# strata = "Gender",
data = AERNASEClinData, includeNA = TRUE),
nonnormal = c(),
quote = FALSE, showAllLevels = TRUE,
format = "p",
contDigits = 3)[,1:2]
level Overall
n 611
Hospital (%) St. Antonius, Nieuwegein 57.1
UMC Utrecht 42.9
ORyear (%) 2002 5.2
2003 10.0
2004 10.6
2005 13.1
2006 13.3
2007 10.5
2008 10.1
2009 11.0
2010 5.6
2011 5.1
2012 3.6
2013 0.8
2014 0.5
2015 0.5
2016 0.2
2017 0.0
2018 0.0
2019 0.0
2020 0.0
2021 0.0
2022 0.0
Artery_summary (%) No artery known (yet), no surgery (patient ill, died, exited study), re-numbered to AAA 0.0
carotid (left & right) 99.7
femoral/iliac (left, right or both sides) 0.0
other carotid arteries (common, external) 0.3
carotid bypass and injury (left, right or both sides) 0.0
aneurysmata (carotid & femoral) 0.0
aorta 0.0
other arteries (renal, popliteal, vertebral) 0.0
femoral bypass, angioseal and injury (left, right or both sides) 0.0
Age (mean (SD)) 34.484 (8.897)
Gender (%) female 24.2
male 75.8
TC_final (mean (SD)) 4.663 (1.259)
LDL_final (mean (SD)) 2.774 (1.046)
HDL_final (mean (SD)) 1.147 (0.375)
TG_final (mean (SD)) 1.605 (0.942)
systolic (mean (SD)) 65.356 (24.793)
diastoli (mean (SD)) 43.364 (13.409)
GFR_MDRD (mean (SD)) 1326.275 (661.445)
BMI (mean (SD)) 667.110 (299.088)
KDOQI (%) Normal kidney function 18.2
CKD 2 (Mild) 55.8
CKD 3 (Moderate) 23.2
CKD 4 (Severe) 1.5
CKD 5 (Failure) 0.0
<NA> 1.3
BMI_WHO (%) Underweight 0.8
Normal 33.2
Overweight 46.2
Obese 14.7
<NA> 5.1
SmokerStatus (%) Current smoker 36.0
Ex-smoker 44.0
Never smoked 16.2
<NA> 3.8
AlcoholUse (%) Yes 34.0
<NA> 66.0
DiabetesStatus (%) Diabetes 78.4
<NA> 21.6
Hypertension.selfreport (%) no 27.3
yes 70.5
<NA> 2.1
Hypertension.selfreportdrug (%) no 33.7
yes 63.8
<NA> 2.5
Hypertension.composite (%) no 13.3
yes 86.7
Hypertension.drugs (%) no 22.9
yes 76.9
<NA> 0.2
Med.anticoagulants (%) no 87.7
yes 12.1
<NA> 0.2
Med.all.antiplatelet (%) no 10.5
yes 89.4
<NA> 0.2
Med.Statin.LLD (%) no 24.5
yes 75.3
<NA> 0.2
Stroke_Dx (%) No stroke diagnosed 75.5
Stroke diagnosed 18.0
<NA> 6.5
sympt (%) Asymptomatic 13.1
TIA 41.2
minor stroke 15.4
Major stroke 9.3
Amaurosis fugax 15.9
Four vessel disease 0.0
Vertebrobasilary TIA 0.2
Retinal infarction 1.5
Symptomatic, but aspecific symtoms 2.6
Contralateral symptomatic occlusion 0.5
retinal infarction 0.2
armclaudication due to occlusion subclavian artery, CEA needed for bypass 0.0
retinal infarction + TIAs 0.0
Ocular ischemic syndrome 0.2
ischemisch glaucoom 0.0
subclavian steal syndrome 0.0
TGA 0.0
Symptoms.5G (%) Ocular 9.5
Other 19.0
Retinal infarction 1.6
Stroke 56.6
TIA 13.3
AsymptSympt (%) Ocular and others 30.1
Symptomatic 69.9
AsymptSympt2G (%) Symptomatic 100.0
Symptoms.Update2G (%) Symptomatic 75.5
<NA> 24.5
Symptoms.Update3G (%) Symptomatic 75.5
<NA> 24.5
restenos (%) de novo 95.9
restenosis 1.8
stenose bij angioseal na PTCA 0.0
<NA> 2.3
stenose (%) 0-49% 0.3
50-70% 6.2
70-90% 44.0
90-99% 44.8
100% (Occlusion) 0.8
NA 0.0
50-99% 0.2
70-99% 0.0
99 0.0
<NA> 3.6
CAD_history (%) No history CAD 66.6
History CAD 33.4
PAOD (%) no 79.5
yes 20.5
Peripheral.interv (%) no 84.8
yes 15.2
EP_composite (%) No composite endpoints 75.0
Composite endpoints 24.5
<NA> 0.5
EP_composite_time (mean (SD)) 2.655 (1.143)
EP_major (%) No major events (endpoints) 86.3
Major events (endpoints) 13.3
<NA> 0.5
EP_major_time (mean (SD)) 2.845 (1.021)
MAC_rankNorm (mean (SD)) 0.276 (0.961)
SMC_rankNorm (mean (SD)) -0.041 (0.930)
Macrophages.bin (%) no/minor 42.7
moderate/heavy 55.5
<NA> 1.8
SMC.bin (%) no/minor 31.6
moderate/heavy 66.6
<NA> 1.8
Neutrophils_rankNorm (mean (SD)) 0.256 (1.020)
MastCells_rankNorm (mean (SD)) -0.021 (1.025)
IPH.bin (%) no 37.6
yes 60.9
<NA> 1.5
VesselDensity_rankNorm (mean (SD)) 0.139 (0.948)
Calc.bin (%) no/minor 46.3
moderate/heavy 52.4
<NA> 1.3
Collagen.bin (%) no/minor 19.1
moderate/heavy 79.4
<NA> 1.5
Fat.bin_10 (%) <10% 23.4
>10% 75.3
<NA> 1.3
Fat.bin_40 (%) <40% 68.9
>40% 29.8
<NA> 1.3
OverallPlaquePhenotype (%) atheromatous 37.3
fibroatheromatous 31.3
fibrous 0.0
<NA> 31.4
Plaque_Vulnerability_Index (%) 0 7.4
1 16.5
2 25.5
3 33.1
4 11.9
5 5.6
Writing the baseline table to Excel format.
# Write basetable
require(openxlsx)
# write.xlsx(file = paste0(BASELINE_loc, "/",Today,".",PROJECTNAME,".AERNA.CEA.608pts.after_qc.IC_commercial.BaselineTable..xlsx"),
# format(AERNASEClinData.CEA.tableOne, digits = 5, scientific = FALSE) ,
# rowNames = TRUE, colNames = TRUE, overwrite = TRUE)
#
write.xlsx(file = paste0(BASELINE_loc, "/",Today,".",PROJECTNAME,".AERNA.CEA.611pts.after_qc.IC_academic.BaselineTable..xlsx"),
format(AERNASEClinData.CEA.tableOne, digits = 5, scientific = FALSE) ,
rowNames = TRUE, colNames = TRUE, overwrite = TRUE)From here we can analyze whether specific genes differ between groups, or do this for the entire gene set as part of DE analysis, and then select our genes of interest. Let’s start with the former.
The dds raw counts need normalization and log transformation first.
AERNAdds <- DESeqDataSet(AERNASE, design = ~ 1)
# Determine the size factors to use for normalization
AERNAdds <- estimateSizeFactors(AERNAdds)
# sizeFactors(AERNAdds)
# Extract the normalized counts
normalized_counts <- counts(AERNAdds, normalized = TRUE)
# head(normalized_counts)
# Log transform counts for QC
AERNAvsd <- vst(AERNAdds, blind = TRUE)
# There is a message stating the following.
#
# -- note: fitType='parametric', but the dispersion trend was not well captured by the
# function: y = a/x + b, and a local regression fit was automatically substituted.
# specify fitType='local' or 'mean' to avoid this message next time.
#
# No action is required.
#
# For more information check: https://www.biostars.org/p/119115/From here, extract the gene expression values, plus gene identifier, annotate with gene symbol, and select the genes of our interest.
expression_data <- assay(AERNAvsd)
# extract expression values from vsd, including ensembl names
expression_data <- as_tibble(data.frame(gene_ensembl = rowRanges(AERNAvsd)$feature_id, assay(AERNAvsd))) %>%
mutate_at(vars(c("gene_ensembl")), list(as.character)) ## gene_ensembl needs to be character for annotation to work
# annotations
# gene symbol - via org.Hs.eg.db
# columns(org.Hs.eg.db)
expression_data$symbol <- mapIds(org.Hs.eg.db,
keys = expression_data$gene_ensembl,
column = "SYMBOL",
keytype = "ENSEMBL",
multiVals = "first")'select()' returned 1:many mapping between keys and columns
# tidy and subset
expression_data_sel <- expression_data %>%
dplyr::select(gene_ensembl, symbol, everything()) %>%
# filter(symbol == "APOE" | symbol == "TRIB3") %>% # filter APOE and TRIB3
dplyr::filter(symbol %in% target_genes)
head(expression_data_sel)
# tidy and subset non-selected genes
set.seed(141619)
expression_data_sample <- expression_data %>%
dplyr::select(gene_ensembl, symbol, everything()) %>%
sample_n(1000) %>%
unite(symbol_ensembl, symbol, gene_ensembl, sep = "_", remove = FALSE)
expression_data_sample_mean <- expression_data_sample %>%
select_if(is.numeric) %>%
colMeans() %>%
as_tibble(rownames = "study_number") %>%
dplyr::rename(expression_value_sample = value)Furthermore, the expression_data_sel df was gathered into a long form
df for annotation with symptoms variables from the
vsd object, and later visualization and statistics.
# gather expression_data_sel df into long df form for annotation, plotting and statistics
expression_long <-
gather(expression_data_sel, key = "study_number", value = "expression_value", -c(gene_ensembl, symbol))
# old school way
# Annotate with smoking variables
# sample_ids <- expression_long$study_number
# mm <- match(expression_long$study_number, rownames(colData(vsd)))
#
# ## Add traits to df
# ## Binary traits
# expression_long$sex <- colData(vsd)$sex[mm]
# expression_long$testosterone <- colData(vsd)$testosterone[mm]
# expression_long$t_e2_ratio <- colData(vsd)$t_e2_ratio[mm]
# new school way
plaque_phenotypes_cat <- c("Macrophages.bin",
"SMC.bin",
"Calc.bin",
"Collagen.bin",
"Fat.bin_10",
# "Fat.bin_40",
"IPH.bin")
plaque_phenotypes_num <- c("MAC_rankNorm", #"macmean0",
"SMC_rankNorm", #"smcmean0",
# "MastCells_rankNorm", #"mast_cells_plaque",
# "Neutrophils_rankNorm", #"neutrophils",
"VesselDensity_rankNorm") #"vessel_density")
expression_long <- expression_long %>%
left_join(bulkRNA_meta_clin %>% dplyr::select(study_number,
plaque_phenotypes_cat,
plaque_phenotypes_num,
epmajor.3years, epmajor.30days,
AsymptSympt2G,
Gender, Hospital),
by = "study_number") %>%
mutate(epmajor_3years_yn = str_replace_all(epmajor.3years, c("Excluded" = "yes", "Included" = "no"))) %>%
mutate(epmajor.30days_yn = str_replace_all(epmajor.30days, c("Excluded" = "yes", "Included" = "no")))Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(plaque_phenotypes_cat)` instead of `plaque_phenotypes_cat` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(plaque_phenotypes_num)` instead of `plaque_phenotypes_num` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
head(expression_long)
# expression_long %>%
# write_tsv("genes_interest_expression.txt")In case some genes are not available in our data we could filter them here.
target_genes[1] "HDAC9" "TWIST1" "IL6" "IL1B"
This code is just an example to filter the list from genes that are not in the data.
# target_genes_rm <- c("AC011294.3", "C6orf195", "C9orf53", "AL137026.1", "DUPD1", "RP11-145E5.5", "PVRL2",
# "LINC00841", "LOC100130539")
#
# temp = target_genes[!target_genes %in% target_genes_rm]
#
# target_genes_qc <- c(temp)
target_genes_qc <- target_genes
# for debug
# target_genes_qc_replace <- c("LINC01600", "DUSP27", "NECTIN2", "C10orf142", "LINC02881")Figure 1: Expression of genes of interest: boxplots
# Make directory for plots
ifelse(!dir.exists(file.path(QC_loc, "/Boxplots")),
dir.create(file.path(QC_loc, "/Boxplots")),
FALSE)[1] FALSE
BOX_loc = paste0(QC_loc,"/Boxplots")
for(GENE in target_genes_qc){
cat(paste0("Plotting expression for ", GENE,".\n"))
temp <- subset(expression_long, symbol == GENE)
compare_means(expression_value ~ Gender, data = temp)
p1 <- ggpubr::ggboxplot(temp,
x = "Gender",
y = "expression_value",
color = "Gender",
palette = "npg",
add = "jitter",
ylab = paste0("normalized expression ", GENE,"" ),
repel = TRUE
) + stat_compare_means()
#print(p1)
cat(paste0("Saving image for ", GENE,".\n"))
ggsave(filename = paste0(BOX_loc, "/", Today, ".",GENE,".expression_vs_gender.png"), plot = last_plot())
ggsave(filename = paste0(BOX_loc, "/", Today, ".",GENE,".expression_vs_gender.pdf"), plot = last_plot())
rm(temp, p1 )
}Plotting expression for HDAC9.
Saving image for HDAC9.
Saving 7 x 7 in image
Saving 7 x 7 in image
Plotting expression for TWIST1.
Saving image for TWIST1.
Saving 7 x 7 in image
Saving 7 x 7 in image
Plotting expression for IL6.
Saving image for IL6.
Saving 7 x 7 in image
Saving 7 x 7 in image
Plotting expression for IL1B.
Saving image for IL1B.
Saving 7 x 7 in image
Saving 7 x 7 in image
Figure 2A: Expression of genes of interest: histograms
# Make directory for plots
ifelse(!dir.exists(file.path(QC_loc, "/Histograms")),
dir.create(file.path(QC_loc, "/Histograms")),
FALSE)[1] FALSE
HISTOGRAM_loc = paste0(QC_loc,"/Histograms")
for(GENE in target_genes_qc){
# cat(paste0("Plotting expression for ", GENE,".\n"))
temp <- subset(expression_long, symbol == GENE)
p1 <- ggpubr::gghistogram(temp,
x = "expression_value",
y = "..count..",
color = "Gender", fill = "Gender",
palette = "npg",
add = "median",
ylab = paste0("normalized expression ", GENE,"" )
)
# print(p1)
cat(paste0("Saving image for ", GENE,".\n"))
ggsave(filename = paste0(HISTOGRAM_loc, "/", Today, ".",GENE,".distribution.png"), plot = last_plot())
# ggsave(filename = paste0(HISTOGRAM_loc, "/", Today, ".",GENE,".distribution.pdf"), plot = last_plot())
rm(temp, p1 )
}Saving image for HDAC9.
Saving 7 x 7 in image
Saving image for TWIST1.
Saving 7 x 7 in image
Saving image for IL6.
Saving 7 x 7 in image
Saving image for IL1B.
Saving 7 x 7 in image
Figure 2B: Expression of genes of interest: density plots
# Make directory for plots
ifelse(!dir.exists(file.path(QC_loc, "/Density")),
dir.create(file.path(QC_loc, "/Density")),
FALSE)[1] FALSE
DENSITY_loc = paste0(QC_loc,"/Density")
for(GENE in target_genes_qc){
# cat(paste0("Plotting expression for ", GENE,".\n"))
temp <- subset(expression_long, symbol == GENE)
p1 <- ggpubr::gghistogram(temp,
x = "expression_value",
y = "..density..",
color = "Gender", fill = "Gender",
palette = "npg",
add = "median",
ylab = paste0("normalized expression ", GENE,"" )
)
# print(p1)
cat(paste0("Saving image for ", GENE,".\n"))
ggsave(filename = paste0(DENSITY_loc, "/", Today, ".",GENE,".density.png"), plot = last_plot())
# ggsave(filename = paste0(DENSITY_loc, "/", Today, ".",GENE,".density.pdf"), plot = last_plot())
rm(temp, p1 )
}Saving image for HDAC9.
Saving 7 x 7 in image
Saving image for TWIST1.
Saving 7 x 7 in image
Saving image for IL6.
Saving 7 x 7 in image
Saving image for IL1B.
Saving 7 x 7 in image
Figure 3: comparing expression of genes of interest to mean expression of a sample of 1,000 random genes
expression_wide <- expression_long %>%
dplyr::select(-gene_ensembl) %>%
spread(key = symbol, value = expression_value)# the next 3 lines of code gave an error when selecting for genes_interest, since one of the genes of interest is missing: FGF3 is not in the data set. So, we need to select for the other 15 genes.
# genes_interest <- genes_interest[genes_interest$Symbol %in% unique(expression_long$symbol),]
# target_genes_qc
expression_wide2 <- expression_wide %>%
left_join(expression_data_sample_mean, by = "study_number") %>%
dplyr::select(study_number, target_genes_qc, expression_value_sample)Note: Using an external vector in selections is ambiguous.
ℹ Use `all_of(target_genes_qc)` instead of `target_genes_qc` to silence this message.
ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
This message is displayed once per session.
expression_long2 <- expression_wide2 %>%
gather(gene, expression_value, -study_number) %>%
mutate(gene = str_replace_all(gene, c("expression_value_sample" = "Random genes"))) #%>%
# mutate(gene = factor(gene, levels = c("Random genes", target_genes_qc)))
mean_1000_genes <- mean(expression_data_sample_mean$expression_value_sample)
# head(expression_long2)
#
p1 <- ggpubr::ggboxplot(expression_long2,
x = "gene",
y = "expression_value",
color = uithof_color[16],
add = "jitter",
add.params = list(size = 3, jitter = 0.3),
ylab = paste0("normalized expression ")
) +
geom_hline(yintercept = mean_1000_genes, linetype = "dashed", color = uithof_color[26], size = 1.25) +
theme(axis.text.x = element_text(size = 18, angle = 45, hjust = 1, vjust = 1), # change orientation of x-axis labels
axis.text.y = element_text(size = 18),
axis.title.x = element_text(size = 20),
axis.title.y = element_text(size = 20))
p1
ggsave(filename = paste0(PLOT_loc, "/", Today, ".TargetExpression_vs_1000genes.png"), plot = last_plot())Saving 18 x 12 in image
ggsave(filename = paste0(PLOT_loc, "/", Today, ".TargetExpression_vs_1000genes.pdf"), plot = last_plot())Saving 18 x 12 in image
rm(p1 )If we would put these correlations in one simple and comprehensible figure, we could use a correlation heatmap. Again, correlation coefficients used here are Spearman’s.
Figure 4: correlation heatmap between expression of genes of interest
library(tidyverse)
library(magrittr)
temp <- expression_wide %>%
column_to_rownames("study_number") %>%
dplyr::select(target_genes_qc) %>%
drop_na() %>% # drop NA
Filter(function(x) sd(x) != 0, .) # filter variables with sd = 0
temp.cor <- cor(temp, method = "spearman")
p1 <- pheatmap(data.matrix(temp.cor),
scale = "none",
cluster_rows = TRUE,
cluster_cols = TRUE,
legend = TRUE,
fontsize = 18)
p1
ggsave(filename = paste0(PLOT_loc, "/", Today, ".correlations.target_genes.pdf"), plot = p1, height = 15, width = 15)
ggsave(filename = paste0(PLOT_loc, "/", Today, ".correlations.target_genes.png"), plot = p1, height = 15, width = 15)
rm(temp, temp.cor, p1)We are saving the final list of genes of interest
temp <- subset(expression_data_sel, select = c("gene_ensembl", "symbol"))
fwrite(temp,
file = paste0(OUT_loc, "/", Today, ".target_list.qc.txt"),
sep = " ", row.names = FALSE, col.names = TRUE,
showProgress = TRUE)
rm(temp)We will create a list of samples that should be included based on
CEA, and having the proper informed consent (‘academic’). We will save
the SummarizedExperiment as a RDS file for easy loading.
The clinical data will also be saved as a separate
txt-file.
temp <- as.tibble(subset(colData(AERNASE), select = c("STUDY_NUMBER", "SampleType", "RNAseqType")))
# fwrite(temp,
# file = paste0(OUT_loc, "/", Today, ".AERNA.CEA.608pts.samplelist.after_qc.IC_commercial.csv"),
# sep = ",", row.names = FALSE, col.names = TRUE,
# showProgress = TRUE)
# rm(temp)
#
# temp <- as.tibble(colData(AERNASE))
#
# fwrite(temp,
# file = paste0(OUT_loc, "/", Today, ".AERNA.CEA.608pts.clinicaldata.after_qc.IC_commercial.csv"),
# sep = ",", row.names = FALSE, col.names = TRUE,
# showProgress = TRUE)
# rm(temp)
fwrite(temp,
file = paste0(OUT_loc, "/", Today, ".AERNA.CEA.611pts.samplelist.after_qc.IC_academic.csv"),
sep = ",", row.names = FALSE, col.names = TRUE,
showProgress = TRUE)
rm(temp)
temp <- as.tibble(colData(AERNASE))
fwrite(temp,
file = paste0(OUT_loc, "/", Today, ".AERNA.CEA.611pts.clinicaldata.after_qc.IC_academic.csv"),
sep = ",", row.names = FALSE, col.names = TRUE,
showProgress = TRUE)
rm(temp)
AERNASEclass: RangedSummarizedExperiment
dim: 59851 611
metadata(1): ''
assays(1): counts
rownames(59851): ENSG00000000003 ENSG00000000419 ... ENSG00000279119 ENSG00000271242
rowData names(3): feature_id symbol GENEBIOTYPE_EnsDb86
colnames(611): ae1 ae1026 ... ae998 ae999
colData names(58): STUDY_NUMBER SampleType ... OverallPlaquePhenotype Plaque_Vulnerability_Index
# saveRDS(AERNASE, file = paste0(OUT_loc, "/", Today, ".AERNA.CEA.608pts.SE.after_qc.IC_commercial.RDS"))
saveRDS(AERNASE, file = paste0(OUT_loc, "/", Today, ".AERNA.CEA.611pts.SE.after_qc.IC_academic.RDS"))Version: v1.0.0
Last update: 2022-03-17
Written by: Sander W. van der Laan (s.w.vanderlaan-2[at]umcutrecht.nl).
Description: Script to load bulk RNA sequencing data, and perform gene expression analyses, and visualisations.
Minimum requirements: R version 3.5.2 (2018-12-20) -- 'Eggshell Igloo', macOS Mojave (10.14.2).
**MoSCoW To-Do List**
The things we Must, Should, Could, and Would have given the time we have.
_M_
_S_
_C_
_W_
**Changes log**
* v1.0.0 Inital version. Update to the count data, gene list. Filter samples based on artery operated (CEA) and informed consent. Added heatmap of correlation between target genes.
sessionInfo()R version 4.1.2 (2021-11-01)
Platform: x86_64-apple-darwin17.0 (64-bit)
Running under: macOS Monterey 12.2.1
Matrix products: default
LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib
locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
attached base packages:
[1] stats4 grid tools stats graphics grDevices utils datasets methods base
other attached packages:
[1] labelled_2.9.0 annotables_0.1.91 EnhancedVolcano_1.12.0 ggrepel_0.9.1
[5] EnsDb.Hsapiens.v86_2.99.0 ensembldb_2.18.2 AnnotationFilter_1.18.0 TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
[9] mygene_1.30.0 org.Hs.eg.db_3.14.0 DESeq2_1.34.0 SummarizedExperiment_1.24.0
[13] MatrixGenerics_1.6.0 matrixStats_0.61.0 GenomicFeatures_1.46.3 AnnotationDbi_1.56.2
[17] Biobase_2.54.0 GenomicRanges_1.46.1 GenomeInfoDb_1.30.0 IRanges_2.28.0
[21] S4Vectors_0.32.3 BiocGenerics_0.40.0 UpSetR_1.4.0 ggpubr_0.4.0
[25] forestplot_2.0.1 checkmate_2.0.0 magrittr_2.0.1 pheatmap_1.0.12
[29] devtools_2.4.3 usethis_2.1.5 BlandAltmanLeh_0.3.1 sjPlot_2.8.10
[33] tableone_0.13.0 haven_2.4.3 openxlsx_4.2.5 eeptools_1.2.4
[37] DT_0.20 knitr_1.37 forcats_0.5.1 stringr_1.4.0
[41] purrr_0.3.4 tibble_3.1.6 ggplot2_3.3.5 tidyverse_1.3.1
[45] data.table_1.14.2 naniar_0.6.1 tidyr_1.1.4 dplyr_1.0.7
[49] optparse_1.7.1 readr_2.1.1 pander_0.6.4 rmarkdown_2.11
[53] worcs_0.1.9.1
loaded via a namespace (and not attached):
[1] estimability_1.3 rappdirs_0.3.3 rtracklayer_1.54.0 rticles_0.22 coda_0.19-4 ragg_1.2.1
[7] visdat_0.5.3 bit64_4.0.5 multcomp_1.4-18 DelayedArray_0.20.0 rpart_4.1.16 KEGGREST_1.34.0
[13] RCurl_1.98-1.5 generics_0.1.1 callr_3.7.0 TH.data_1.1-0 RSQLite_2.2.9 proxy_0.4-26
[19] chron_2.3-56 bit_4.0.4 tzdb_0.2.0 xml2_1.3.3 lubridate_1.8.0 ggsci_2.9
[25] assertthat_0.2.1 xfun_0.29 hms_1.1.1 evaluate_0.14 fansi_1.0.0 restfulr_0.0.13
[31] progress_1.2.2 dbplyr_2.1.1 readxl_1.3.1 DBI_1.1.2 geneplotter_1.72.0 htmlwidgets_1.5.4
[37] ellipsis_0.3.2 backports_1.4.1 insight_0.16.0 survey_4.1-1 annotate_1.72.0 biomaRt_2.50.2
[43] vctrs_0.3.8 remotes_2.4.2 sjlabelled_1.1.8 abind_1.4-5 cachem_1.0.6 withr_2.4.3
[49] sys_3.4 emmeans_1.7.2 vcd_1.4-9 GenomicAlignments_1.30.0 prettyunits_1.1.1 getopt_1.20.3
[55] cluster_2.1.2 lazyeval_0.2.2 crayon_1.4.2 genefilter_1.76.0 labeling_0.4.2 pkgconfig_2.0.3
[61] vipor_0.4.5 nlme_3.1-155 pkgload_1.2.4 ProtGenerics_1.26.0 nnet_7.3-17 rlang_0.4.12
[67] lifecycle_1.0.1 sandwich_3.0-1 filelock_1.0.2 extrafontdb_1.0 BiocFileCache_2.2.0 modelr_0.1.8
[73] ggrastr_1.0.1 cellranger_1.1.0 rprojroot_2.0.2 lmtest_0.9-39 datawizard_0.3.0 Matrix_1.4-0
[79] carData_3.0-5 boot_1.3-28 zoo_1.8-9 beeswarm_0.4.0 base64enc_0.1-3 reprex_2.0.1
[85] processx_3.5.2 png_0.1-7 rjson_0.2.21 parameters_0.17.0 bitops_1.0-7 KernSmooth_2.23-20
[91] Biostrings_2.62.0 blob_1.2.2 maptools_1.1-2 arm_1.12-2 jpeg_0.1-9 rstatix_0.7.0
[97] ggeffects_1.1.1 ggsignif_0.6.3 scales_1.1.1 memoise_2.0.1 plyr_1.8.6 zlibbioc_1.40.0
[103] compiler_4.1.2 BiocIO_1.4.0 ash_1.0-15 RColorBrewer_1.1-2 lme4_1.1-27.1 Rsamtools_2.10.0
[109] cli_3.1.0 XVector_0.34.0 ps_1.6.0 htmlTable_2.4.0 Formula_1.2-4 MASS_7.3-54
[115] tidyselect_1.1.1 stringi_1.7.6 textshaping_0.3.6 proj4_1.0-10.1 mitools_2.4 yaml_2.2.1
[121] askpass_1.1 locfit_1.5-9.4 latticeExtra_0.6-29 credentials_1.3.2 parallel_4.1.2 rstudioapi_0.13
[127] foreign_0.8-82 gridExtra_2.3 farver_2.1.0 digest_0.6.29 proto_1.0.0 gert_1.5.0
[133] Rcpp_1.0.7 prereg_0.5.0 car_3.0-12 broom_0.7.11 ggalt_0.4.0 performance_0.8.0
[139] httr_1.4.2 effectsize_0.6.0.1 sjstats_0.18.1 colorspace_2.0-2 rvest_1.0.2 XML_3.99-0.8
[145] fs_1.5.2 ranger_0.13.1 splines_4.1.2 sp_1.4-6 systemfonts_1.0.3 sessioninfo_1.2.2
[151] xtable_1.8-4 jsonlite_1.7.2 nloptr_1.2.2.3 testthat_3.1.1 R6_2.5.1 Hmisc_4.6-0
[157] gsubfn_0.7 pillar_1.6.4 htmltools_0.5.2 glue_1.6.0 fastmap_1.1.0 minqa_1.2.4
[163] BiocParallel_1.28.3 class_7.3-20 codetools_0.2-18 maps_3.4.0 pkgbuild_1.3.1 mvtnorm_1.1-3
[169] utf8_1.2.2 lattice_0.20-45 sqldf_0.4-11 ggbeeswarm_0.6.0 curl_4.3.2 Rttf2pt1_1.3.9
[175] zip_2.2.0 openssl_1.4.6 survival_3.3-1 desc_1.4.0 munsell_0.5.0 e1071_1.7-9
[181] GenomeInfoDbData_1.2.7 sjmisc_2.8.9 gtable_0.3.0 extrafont_0.17 bayestestR_0.11.5
save.image(paste0(PROJECT_loc, "/",Today,".",PROJECTNAME,".bulkRNAseq.preparation.RData"))| © 1979-2022 Sander W. van der Laan | s.w.vanderlaan[at]gmail.com swvanderlaan.github.io. |